{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example of using CatBoost on text data with word2vec embedding."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import catboost\n",
    "import collections\n",
    "import gensim\n",
    "import os\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "import tensorflow as tf\n",
    "import zipfile\n",
    "\n",
    "from scipy.stats import skew, kurtosis\n",
    "from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import log_loss, roc_auc_score\n",
    "\n",
    "from urllib import urlretrieve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_path = '../data/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Embedding"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train word2vec embeddings using Tensorflow ([from this example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/5_word2vec.ipynb))."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load [Text8](http://mattmahoney.net/dc/textdata) data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "url = 'http://mattmahoney.net/dc/'\n",
    "filename = 'text8.zip'\n",
    "filename, _ = urlretrieve(url + filename, data_path + filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with zipfile.ZipFile(data_path + filename) as f:\n",
    "    words = tf.compat.as_str(f.read(f.namelist()[0])).split()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Build a dataset. Rare words are replaced with 'UNK' token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocabulary_size = 50000\n",
    "count = [['UNK', -1]]\n",
    "count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
    "dictionary = dict()\n",
    "\n",
    "for word, _ in count:\n",
    "    dictionary[word] = len(dictionary)\n",
    "data = list()\n",
    "unk_count = 0\n",
    "for word in words:\n",
    "    if word in dictionary:\n",
    "        index = dictionary[word]\n",
    "    else:\n",
    "        index = 0  # dictionary['UNK']\n",
    "        unk_count = unk_count + 1\n",
    "    data.append(index)\n",
    "\n",
    "count[0][1] = unk_count\n",
    "reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) \n",
    "del words"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Write batch generator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data_index = 0\n",
    "\n",
    "def generate_batch(batch_size, num_skips, skip_window):\n",
    "    global data_index\n",
    "    batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
    "    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
    "    span = 2 * skip_window + 1  # [ skip_window target skip_window ]\n",
    "    buf = collections.deque(maxlen=span)\n",
    "    for _ in xrange(span):\n",
    "        buf.append(data[data_index])\n",
    "        data_index = (data_index + 1) % len(data)\n",
    "    for i in xrange(batch_size // num_skips):\n",
    "        target = skip_window  # target label at the center of the buffer\n",
    "        targets_to_avoid = [ skip_window ]\n",
    "        for j in xrange(num_skips):\n",
    "            while target in targets_to_avoid:\n",
    "                target = random.randint(0, span - 1)\n",
    "            targets_to_avoid.append(target)\n",
    "            batch[i * num_skips + j] = buf[skip_window]\n",
    "            labels[i * num_skips + j, 0] = buf[target]\n",
    "        buf.append(data[data_index])\n",
    "        data_index = (data_index + 1) % len(data)\n",
    "    return batch, labels"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train a skip-gram model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedding_size = 128  # Dimension of the embedding vector.\n",
    "skip_window = 1       # How many words to consider left and right.\n",
    "num_skips = 2         # How many times to reuse an input to generate a label.\n",
    "num_sampled = 64      # Number of negative examples to sample.\n",
    "\n",
    "graph = tf.Graph()\n",
    "\n",
    "with graph.as_default(), tf.device('/cpu:0'):\n",
    "    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])\n",
    "    train_labels  = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
    "  \n",
    "    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "    softmax_weights = tf.Variable(\n",
    "        tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0/np.sqrt(embedding_size)))\n",
    "    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
    "  \n",
    "    embed = tf.nn.embedding_lookup(embeddings, train_dataset)\n",
    "    loss = tf.reduce_mean(\n",
    "        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,\n",
    "                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))\n",
    "\n",
    "    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)\n",
    "\n",
    "    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
    "    normalized_embeddings = embeddings / norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average loss at step 100000: 3.453762\n",
      "Average loss at step 200000: 3.241983\n",
      "Average loss at step 300000: 3.176724\n",
      "Average loss at step 400000: 3.131535\n",
      "Average loss at step 500000: 3.077026\n"
     ]
    }
   ],
   "source": [
    "num_steps = 500001\n",
    "\n",
    "with tf.Session(graph=graph) as session:\n",
    "    tf.global_variables_initializer().run()\n",
    "    average_loss = 0\n",
    "    for step in range(num_steps):\n",
    "        batch_data, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n",
    "        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}\n",
    "        _, l = session.run([optimizer, loss], feed_dict=feed_dict)\n",
    "        average_loss += l\n",
    "        if step % 100000 == 0 and step > 0:\n",
    "            print('Average loss at step %d: %f' % (step, average_loss / 100000))\n",
    "            average_loss = 0\n",
    "    word2vec = normalized_embeddings.eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check trained word2vec: find nearest for car."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cars vehicle motorcycle train vehicles\n"
     ]
    }
   ],
   "source": [
    "distances = -word2vec[dictionary['car']].reshape((1, -1)).dot(word2vec.T)\n",
    "inds = np.argsort(distances.ravel())[1:6]\n",
    "print(' '.join([reverse_dictionary[i] for i in inds]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Also you can:\n",
    "1. Change parameters of model.\n",
    "2. Change dataset to bigger one.\n",
    "3. Increase train time.\n",
    "4. Use pretrained model (not only word2vec)."
   ]
  },
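  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For item 4, here is a minimal sketch of loading pretrained vectors with gensim (imported above but otherwise unused). It assumes you have already downloaded `GoogleNews-vectors-negative300.bin` into `data_path`; the file name and location are assumptions, not something this notebook fetches for you."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: load pretrained word2vec vectors with gensim.\n",
    "# Assumes GoogleNews-vectors-negative300.bin was downloaded to data_path beforehand.\n",
    "pretrained = gensim.models.KeyedVectors.load_word2vec_format(\n",
    "    data_path + 'GoogleNews-vectors-negative300.bin', binary=True)\n",
    "print(' '.join(w for w, _ in pretrained.most_similar('car', topn=5)))"
   ]
  },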
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load dataset from [Kaggle Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs/overview) competition. The goal of this task is to determine which pair of questions is duplicated (binary classification)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>qid1</th>\n",
       "      <th>qid2</th>\n",
       "      <th>question1</th>\n",
       "      <th>question2</th>\n",
       "      <th>is_duplicate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>What is the step by step guide to invest in sh...</td>\n",
       "      <td>What is the step by step guide to invest in sh...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
       "      <td>What would happen if the Indian government sto...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>How can I increase the speed of my internet co...</td>\n",
       "      <td>How can Internet speed be increased by hacking...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>Why am I mentally very lonely? How can I solve...</td>\n",
       "      <td>Find the remainder when [math]23^{24}[/math] i...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>Which one dissolve in water quikly sugar, salt...</td>\n",
       "      <td>Which fish would survive in salt water?</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  qid1  qid2                                          question1  \\\n",
       "0   0     1     2  What is the step by step guide to invest in sh...   \n",
       "1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   \n",
       "2   2     5     6  How can I increase the speed of my internet co...   \n",
       "3   3     7     8  Why am I mentally very lonely? How can I solve...   \n",
       "4   4     9    10  Which one dissolve in water quikly sugar, salt...   \n",
       "\n",
       "                                           question2  is_duplicate  \n",
       "0  What is the step by step guide to invest in sh...             0  \n",
       "1  What would happen if the Indian government sto...             0  \n",
       "2  How can Internet speed be increased by hacking...             0  \n",
       "3  Find the remainder when [math]23^{24}[/math] i...             0  \n",
       "4            Which fish would survive in salt water?             0  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv(data_path + 'train.csv').fillna('')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target = data.is_duplicate\n",
    "data.drop(['is_duplicate', 'id', 'qid1', 'qid2'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data.question1 = data.question1.apply(lambda x: x.lower().decode('utf-8'))\n",
    "data.question2 = data.question2.apply(lambda x: x.lower().decode('utf-8'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature extraction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Nltk for tokenizer and stop-words filtering."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/ekayumov/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /home/ekayumov/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "stop_words = nltk.corpus.stopwords.words('english')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get a vector of every question as:\n",
    "1. Tokenizing\n",
    "2. Filtering from stop-words and non-words\n",
    "3. Summig vectors of words and normilizing it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "EPS = 1e-100\n",
    "\n",
    "def question2vec(s):\n",
    "    words = nltk.word_tokenize(s)\n",
    "    words = filter(lambda x: not x in stop_words and x.isalpha(), words)\n",
    "    seq = np.array([word2vec[dictionary[w]] for w in words if w in dictionary])\n",
    "    v = seq.sum(axis=0)\n",
    "    return v / ((v ** 2).sum() + EPS) ** 0.5 if seq.shape[0] != 0 else np.ones(embedding_size)*1.0/embedding_size**0.5\n",
    "\n",
    "question1_vec = np.array([question2vec(q) for q in data.question1.values])\n",
    "question2_vec = np.array([question2vec(q) for q in data.question2.values])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can not only average vectors but also find max, min and std for all question."
   ]
  },
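  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of such pooling, reusing the tokenization and filtering steps from `question2vec` above. The helper name `question2stats` is ours, not part of any library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: element-wise max/min/std pooling over a question's word vectors.\n",
    "def question2stats(s):\n",
    "    words = nltk.word_tokenize(s)\n",
    "    words = filter(lambda x: x not in stop_words and x.isalpha(), words)\n",
    "    seq = np.array([word2vec[dictionary[w]] for w in words if w in dictionary])\n",
    "    if seq.shape[0] == 0:\n",
    "        seq = np.zeros((1, embedding_size))  # no known words: degenerate stats\n",
    "    return seq.max(axis=0), seq.min(axis=0), seq.std(axis=0)"
   ]
  },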
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate features on embeddings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "data['cosine']     = [cosine(x, y)       for (x, y) in zip(question1_vec, question2_vec)]\n",
    "data['cityblock']  = [cityblock(x, y)    for (x, y) in zip(question1_vec, question2_vec)]\n",
    "data['canberra']   = [canberra(x, y)     for (x, y) in zip(question1_vec, question2_vec)]\n",
    "data['euclidean']  = [euclidean(x, y)    for (x, y) in zip(question1_vec, question2_vec)]\n",
    "data['minkowski']  = [minkowski(x, y, 3) for (x, y) in zip(question1_vec, question2_vec)]\n",
    "data['braycurtis'] = [braycurtis(x, y)   for (x, y) in zip(question1_vec, question2_vec)]\n",
    "\n",
    "data['skew_q1'] = [skew(x) for x in question1_vec]\n",
    "data['skew_q2'] = [skew(x) for x in question2_vec]\n",
    "data['kur_q1']  = [kurtosis(x) for x in question1_vec]\n",
    "data['kur_q2']  = [kurtosis(x) for x in question2_vec]\n",
    "\n",
    "data['skew_diff'] = np.abs(data['skew_q1'] - data['skew_q2'])\n",
    "data['kur_diff']  = np.abs(data['kur_q1'] - data['kur_q2'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In addition you can not only calculate metric between question but use all vectors or differences."
   ]
  },
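  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A sketch of the element-wise difference features. They are computed but deliberately not joined to `data`, so the model below is trained on exactly the features described above; uncomment the join to try them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: element-wise absolute differences of the question vectors\n",
    "# (one column per embedding dimension) could be joined to `data`.\n",
    "w2v_diff = np.abs(question1_vec - question2_vec)\n",
    "# data = data.join(pd.DataFrame(w2v_diff, index=data.index,\n",
    "#                               columns=['w2v_diff_%d' % i for i in range(embedding_size)]))\n",
    "w2v_diff.shape"
   ]
  },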
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Generate simple features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data['len_q1'] = data.question1.apply(lambda x: len(x))\n",
    "data['len_q2'] = data.question2.apply(lambda x: len(x))\n",
    "data['len_diff'] = np.abs(data.len_q1 - data.len_q2)\n",
    "\n",
    "data['len_char_q1'] = data.question1.apply(lambda x: len(x.replace(' ', '')))\n",
    "data['len_char_q2'] = data.question2.apply(lambda x: len(x.replace(' ', '')))\n",
    "data['len_char_diff'] = np.abs(data.len_char_q1 - data.len_char_q2)\n",
    "\n",
    "data['len_uniq_char_q1'] = data.question1.apply(lambda x: len(''.join(set(x.replace(' ', '')))))\n",
    "data['len_uniq_char_q2'] = data.question2.apply(lambda x: len(''.join(set(x.replace(' ', '')))))\n",
    "data['len_uniq_char_diff'] = np.abs(data.len_uniq_char_q1 - data.len_uniq_char_q2)\n",
    "\n",
    "data['len_word_q1'] = data.question1.apply(lambda x: len(x.split()))\n",
    "data['len_word_q2'] = data.question2.apply(lambda x: len(x.split()))\n",
    "data['len_word_diff'] = np.abs(data.len_word_q1 - data.len_word_q2)\n",
    "\n",
    "data['len_uniq_word_q1'] = data.question1.apply(lambda x: len(set(x.split())))\n",
    "data['len_uniq_word_q2'] = data.question2.apply(lambda x: len(set(x.split())))\n",
    "data['len_uniq_word_diff'] = np.abs(data.len_uniq_word_q1 - data.len_uniq_word_q2)\n",
    "\n",
    "data['common_words']  = data.apply(lambda x: len(set(x['question1'].split()).intersection(set(x['question2'].split()))), axis=1)\n",
    "data['union_words']   = data.apply(lambda x: len(set(x['question1'].split()).union(set(x['question2'].split()))), axis=1)\n",
    "data['jaccard_words'] = data.common_words / (data.union_words + EPS)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train and check model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split dataset to train and validation parts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train, test, y_train, y_test = train_test_split(data.drop(['question1', 'question2'], axis=1), target, test_size=0.2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train CatBoost and check prediction on validation part."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<catboost.catboost._CatBoostBase at 0x7fab9d768350>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = catboost.CatBoostClassifier(depth=6, iterations=1000, learning_rate=0.1, thread_count=16)\n",
    "clf.fit(train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AUC: 0.826508171695\n"
     ]
    }
   ],
   "source": [
    "y_pred = clf.predict_proba(test)[:, 1]\n",
    "print 'AUC:', roc_auc_score(y_test, y_pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
